{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.7"
    },
    "colab": {
      "name": "7-6_Transformer.ipynb",
      "provenance": [],
      "collapsed_sections": []
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "D1PpwmcMJFRB",
        "colab_type": "text"
      },
      "source": [
        "# 7.6  Transformerモデル（分類タスク用）の実装\n",
        "\n",
        "- 本ファイルでは、クラス分類のTransformerモデルを実装します。\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0aSx2BwBJFRC",
        "colab_type": "text"
      },
      "source": [
        "※　本章のファイルはすべてUbuntuでの動作を前提としています。Windowsなど文字コードが違う環境での動作にはご注意下さい。"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VD2JfFFgJFRD",
        "colab_type": "text"
      },
      "source": [
        "# 7.6 学習目標\n",
        "\n",
        "1.\tTransformerのモジュール構成を理解する\n",
        "2.\tLSTMやRNNを使用せずCNNベースのTransformerで自然言語処理が可能な理由を理解する\n",
        "3.\tTransformerを実装できるようになる\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3IBOjXDeJFRE",
        "colab_type": "text"
      },
      "source": [
        "# 事前準備\n",
        "書籍の指示に従い、本章で使用するデータを用意します\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "w1kMABqAJFRF",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import math\n",
        "import numpy as np\n",
        "import random\n",
        "\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F \n",
        "import torchtext"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "U2wc_IbWJFRI",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Setup seeds\n",
        "torch.manual_seed(1234)\n",
        "np.random.seed(1234)\n",
        "random.seed(1234)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iAB9spQJJFRM",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class Embedder(nn.Module):\n",
        "    '''idで示されている単語をベクトルに変換します'''\n",
        "\n",
        "    def __init__(self, text_embedding_vectors):\n",
        "        super(Embedder, self).__init__()\n",
        "\n",
        "        self.embeddings = nn.Embedding.from_pretrained(\n",
        "            embeddings=text_embedding_vectors, freeze=True)\n",
        "        # freeze=Trueによりバックプロパゲーションで更新されず変化しなくなります\n",
        "\n",
        "    def forward(self, x):\n",
        "        x_vec = self.embeddings(x)\n",
        "\n",
        "        return x_vec\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "loHdRkRaJFRQ",
        "colab_type": "code",
        "colab": {},
        "outputId": "c584d9db-3cfe-4ea3-b1cf-6f5cf1c684fa"
      },
      "source": [
        "# 動作確認\n",
        "\n",
        "# 前節のDataLoaderなどを取得\n",
        "from utils.dataloader import get_IMDb_DataLoaders_and_TEXT\n",
        "train_dl, val_dl, test_dl, TEXT = get_IMDb_DataLoaders_and_TEXT(\n",
        "    max_length=256, batch_size=24)\n",
        "\n",
        "# ミニバッチの用意\n",
        "batch = next(iter(train_dl))\n",
        "\n",
        "# モデル構築\n",
        "net1 = Embedder(TEXT.vocab.vectors)\n",
        "\n",
        "# 入出力\n",
        "x = batch.Text[0]\n",
        "x1 = net1(x)  # 単語をベクトルに\n",
        "\n",
        "print(\"入力のテンソルサイズ：\", x.shape)\n",
        "print(\"出力のテンソルサイズ：\", x1.shape)\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "入力のテンソルサイズ： torch.Size([24, 256])\n",
            "出力のテンソルサイズ： torch.Size([24, 256, 300])\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "S-R3xrLWJFRU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class PositionalEncoder(nn.Module):\n",
        "    '''入力された単語の位置を示すベクトル情報を付加する'''\n",
        "\n",
        "    def __init__(self, d_model=300, max_seq_len=256):\n",
        "        super().__init__()\n",
        "\n",
        "        self.d_model = d_model  # 単語ベクトルの次元数\n",
        "\n",
        "        # 単語の順番（pos）と埋め込みベクトルの次元の位置（i）によって一意に定まる値の表をpeとして作成\n",
        "        pe = torch.zeros(max_seq_len, d_model)\n",
        "\n",
        "        # GPUが使える場合はGPUへ送る、ここでは省略。実際に学習時には使用する\n",
        "        # device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
        "        # pe = pe.to(device)\n",
        "\n",
        "        for pos in range(max_seq_len):\n",
        "            for i in range(0, d_model, 2):\n",
        "                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))\n",
        "                \n",
        "                # 誤植修正_200510 #79\n",
        "                # pe[pos, i + 1] = math.cos(pos /\n",
        "                #                          (10000 ** ((2 * (i + 1))/d_model)))\n",
        "                pe[pos, i + 1] = math.cos(pos /\n",
        "                                          (10000 ** ((2 * i)/d_model)))\n",
        "\n",
        "        # 表peの先頭に、ミニバッチ次元となる次元を足す\n",
        "        self.pe = pe.unsqueeze(0)\n",
        "\n",
        "        # 勾配を計算しないようにする\n",
        "        self.pe.requires_grad = False\n",
        "\n",
        "    def forward(self, x):\n",
        "\n",
        "        # 入力xとPositonal Encodingを足し算する\n",
        "        # xがpeよりも小さいので、大きくする\n",
        "        ret = math.sqrt(self.d_model)*x + self.pe\n",
        "        return ret\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SZoqYmiiJFRX",
        "colab_type": "code",
        "colab": {},
        "outputId": "6f3815a2-2a4a-4bf6-bad9-0ccc6fe69821"
      },
      "source": [
        "# 動作確認\n",
        "\n",
        "# モデル構築\n",
        "net1 = Embedder(TEXT.vocab.vectors)\n",
        "net2 = PositionalEncoder(d_model=300, max_seq_len=256)\n",
        "\n",
        "# 入出力\n",
        "x = batch.Text[0]\n",
        "x1 = net1(x)  # 単語をベクトルに\n",
        "x2 = net2(x1)\n",
        "\n",
        "print(\"入力のテンソルサイズ：\", x1.shape)\n",
        "print(\"出力のテンソルサイズ：\", x2.shape)\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "入力のテンソルサイズ： torch.Size([24, 256, 300])\n",
            "出力のテンソルサイズ： torch.Size([24, 256, 300])\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i8wcSoeDJFRa",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class Attention(nn.Module):\n",
        "    '''Transformerは本当はマルチヘッドAttentionですが、\n",
        "    分かりやすさを優先しシングルAttentionで実装します'''\n",
        "\n",
        "    def __init__(self, d_model=300):\n",
        "        super().__init__()\n",
        "\n",
        "        # SAGANでは1dConvを使用したが、今回は全結合層で特徴量を変換する\n",
        "        self.q_linear = nn.Linear(d_model, d_model)\n",
        "        self.v_linear = nn.Linear(d_model, d_model)\n",
        "        self.k_linear = nn.Linear(d_model, d_model)\n",
        "\n",
        "        # 出力時に使用する全結合層\n",
        "        self.out = nn.Linear(d_model, d_model)\n",
        "\n",
        "        # Attentionの大きさ調整の変数\n",
        "        self.d_k = d_model\n",
        "\n",
        "    def forward(self, q, k, v, mask):\n",
        "        # 全結合層で特徴量を変換\n",
        "        k = self.k_linear(k)\n",
        "        q = self.q_linear(q)\n",
        "        v = self.v_linear(v)\n",
        "\n",
        "        # Attentionの値を計算する\n",
        "        # 各値を足し算すると大きくなりすぎるので、root(d_k)で割って調整\n",
        "        weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.d_k)\n",
        "\n",
        "        # ここでmaskを計算\n",
        "        mask = mask.unsqueeze(1)\n",
        "        weights = weights.masked_fill(mask == 0, -1e9)\n",
        "\n",
        "        # softmaxで規格化をする\n",
        "        normlized_weights = F.softmax(weights, dim=-1)\n",
        "\n",
        "        # AttentionをValueとかけ算\n",
        "        output = torch.matmul(normlized_weights, v)\n",
        "\n",
        "        # 全結合層で特徴量を変換\n",
        "        output = self.out(output)\n",
        "\n",
        "        return output, normlized_weights\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "e1AcDAfjJFRe",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class FeedForward(nn.Module):\n",
        "    def __init__(self, d_model, d_ff=1024, dropout=0.1):\n",
        "        '''Attention層から出力を単純に全結合層2つで特徴量を変換するだけのユニットです'''\n",
        "        super().__init__()\n",
        "\n",
        "        self.linear_1 = nn.Linear(d_model, d_ff)\n",
        "        self.dropout = nn.Dropout(dropout)\n",
        "        self.linear_2 = nn.Linear(d_ff, d_model)\n",
        "\n",
        "    def forward(self, x):\n",
        "        x = self.linear_1(x)\n",
        "        x = self.dropout(F.relu(x))\n",
        "        x = self.linear_2(x)\n",
        "        return x\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JQ5Y2sDDJFRh",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class TransformerBlock(nn.Module):\n",
        "    def __init__(self, d_model, dropout=0.1):\n",
        "        super().__init__()\n",
        "\n",
        "        # LayerNormalization層\n",
        "        # https://pytorch.org/docs/stable/nn.html?highlight=layernorm\n",
        "        self.norm_1 = nn.LayerNorm(d_model)\n",
        "        self.norm_2 = nn.LayerNorm(d_model)\n",
        "\n",
        "        # Attention層\n",
        "        self.attn = Attention(d_model)\n",
        "\n",
        "        # Attentionのあとの全結合層2つ\n",
        "        self.ff = FeedForward(d_model)\n",
        "\n",
        "        # Dropout\n",
        "        self.dropout_1 = nn.Dropout(dropout)\n",
        "        self.dropout_2 = nn.Dropout(dropout)\n",
        "\n",
        "    def forward(self, x, mask):\n",
        "        # 正規化とAttention\n",
        "        x_normlized = self.norm_1(x)\n",
        "        output, normlized_weights = self.attn(\n",
        "            x_normlized, x_normlized, x_normlized, mask)\n",
        "        \n",
        "        x2 = x + self.dropout_1(output)\n",
        "\n",
        "        # 正規化と全結合層\n",
        "        x_normlized2 = self.norm_2(x2)\n",
        "        output = x2 + self.dropout_2(self.ff(x_normlized2))\n",
        "\n",
        "        return output, normlized_weights\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KNaVWuNJJFRk",
        "colab_type": "code",
        "colab": {},
        "outputId": "e20710fc-c547-48e0-831d-3aef9d53c367"
      },
      "source": [
        "# 動作確認\n",
        "\n",
        "# モデル構築\n",
        "net1 = Embedder(TEXT.vocab.vectors)\n",
        "net2 = PositionalEncoder(d_model=300, max_seq_len=256)\n",
        "net3 = TransformerBlock(d_model=300)\n",
        "\n",
        "# maskの作成\n",
        "x = batch.Text[0]\n",
        "input_pad = 1  # 単語のIDにおいて、'<pad>': 1 なので\n",
        "input_mask = (x != input_pad)\n",
        "print(input_mask[0])\n",
        "\n",
        "# 入出力\n",
        "x1 = net1(x)  # 単語をベクトルに\n",
        "x2 = net2(x1)  # Positon情報を足し算\n",
        "x3, normlized_weights = net3(x2, input_mask)  # Self-Attentionで特徴量を変換\n",
        "\n",
        "print(\"入力のテンソルサイズ：\", x2.shape)\n",
        "print(\"出力のテンソルサイズ：\", x3.shape)\n",
        "print(\"Attentionのサイズ：\", normlized_weights.shape)\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
            "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
            "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
            "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
            "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
            "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
            "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
            "        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
            "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
            "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
            "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.uint8)\n",
            "入力のテンソルサイズ： torch.Size([24, 256, 300])\n",
            "出力のテンソルサイズ： torch.Size([24, 256, 300])\n",
            "Attentionのサイズ： torch.Size([24, 256, 256])\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "pcrNM1rpJFRn",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class ClassificationHead(nn.Module):\n",
        "    '''Transformer_Blockの出力を使用し、最後にクラス分類させる'''\n",
        "\n",
        "    def __init__(self, d_model=300, output_dim=2):\n",
        "        super().__init__()\n",
        "\n",
        "        # 全結合層\n",
        "        self.linear = nn.Linear(d_model, output_dim)  # output_dimはポジ・ネガの2つ\n",
        "\n",
        "        # 重み初期化処理\n",
        "        nn.init.normal_(self.linear.weight, std=0.02)\n",
        "        nn.init.normal_(self.linear.bias, 0)\n",
        "\n",
        "    def forward(self, x):\n",
        "        x0 = x[:, 0, :]  # 各ミニバッチの各文の先頭の単語の特徴量（300次元）を取り出す\n",
        "        out = self.linear(x0)\n",
        "\n",
        "        return out\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WUVS-wxoJFRq",
        "colab_type": "code",
        "colab": {},
        "outputId": "e52ae2a8-1e44-45ef-d00e-0ae01b4f1ad5"
      },
      "source": [
        "# 動作確認\n",
        "\n",
        "# ミニバッチの用意\n",
        "batch = next(iter(train_dl))\n",
        "\n",
        "# モデル構築\n",
        "net1 = Embedder(TEXT.vocab.vectors)\n",
        "net2 = PositionalEncoder(d_model=300, max_seq_len=256)\n",
        "net3 = TransformerBlock(d_model=300)\n",
        "net4 = ClassificationHead(output_dim=2, d_model=300)\n",
        "\n",
        "# 入出力\n",
        "x = batch.Text[0]\n",
        "x1 = net1(x)  # 単語をベクトルに\n",
        "x2 = net2(x1)  # Positon情報を足し算\n",
        "x3, normlized_weights = net3(x2, input_mask)  # Self-Attentionで特徴量を変換\n",
        "x4 = net4(x3)  # 最終出力の0単語目を使用して、分類0-1のスカラーを出力\n",
        "\n",
        "print(\"入力のテンソルサイズ：\", x3.shape)\n",
        "print(\"出力のテンソルサイズ：\", x4.shape)\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "入力のテンソルサイズ： torch.Size([24, 256, 300])\n",
            "出力のテンソルサイズ： torch.Size([24, 2])\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "k8-NHGc3JFRt",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# 最終的なTransformerモデルのクラス\n",
        "\n",
        "\n",
        "class TransformerClassification(nn.Module):\n",
        "    '''Transformerでクラス分類させる'''\n",
        "\n",
        "    def __init__(self, text_embedding_vectors, d_model=300, max_seq_len=256, output_dim=2):\n",
        "        super().__init__()\n",
        "\n",
        "        # モデル構築\n",
        "        self.net1 = Embedder(text_embedding_vectors)\n",
        "        self.net2 = PositionalEncoder(d_model=d_model, max_seq_len=max_seq_len)\n",
        "        self.net3_1 = TransformerBlock(d_model=d_model)\n",
        "        self.net3_2 = TransformerBlock(d_model=d_model)\n",
        "        self.net4 = ClassificationHead(output_dim=output_dim, d_model=d_model)\n",
        "\n",
        "    def forward(self, x, mask):\n",
        "        x1 = self.net1(x)  # 単語をベクトルに\n",
        "        x2 = self.net2(x1)  # Positon情報を足し算\n",
        "        x3_1, normlized_weights_1 = self.net3_1(\n",
        "            x2, mask)  # Self-Attentionで特徴量を変換\n",
        "        x3_2, normlized_weights_2 = self.net3_2(\n",
        "            x3_1, mask)  # Self-Attentionで特徴量を変換\n",
        "        x4 = self.net4(x3_2)  # 最終出力の0単語目を使用して、分類0-1のスカラーを出力\n",
        "        return x4, normlized_weights_1, normlized_weights_2\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "g2dt7_rvJFRv",
        "colab_type": "code",
        "colab": {},
        "outputId": "9b5659f4-2d7c-4a7d-e3ae-f2c1b0703750"
      },
      "source": [
        "# 動作確認\n",
        "\n",
        "# ミニバッチの用意\n",
        "batch = next(iter(train_dl))\n",
        "\n",
        "# モデル構築\n",
        "net = TransformerClassification(\n",
        "    text_embedding_vectors=TEXT.vocab.vectors, d_model=300, max_seq_len=256, output_dim=2)\n",
        "\n",
        "# 入出力\n",
        "x = batch.Text[0]\n",
        "input_mask = (x != input_pad)\n",
        "out, normlized_weights_1, normlized_weights_2 = net(x, input_mask)\n",
        "\n",
        "print(\"出力のテンソルサイズ：\", out.shape)\n",
        "print(\"出力テンソルのsigmoid：\", F.softmax(out, dim=1))\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "出力のテンソルサイズ： torch.Size([24, 2])\n",
            "出力テンソルのsigmoid： tensor([[0.6980, 0.3020],\n",
            "        [0.7318, 0.2682],\n",
            "        [0.7244, 0.2756],\n",
            "        [0.7135, 0.2865],\n",
            "        [0.7022, 0.2978],\n",
            "        [0.6974, 0.3026],\n",
            "        [0.6831, 0.3169],\n",
            "        [0.6487, 0.3513],\n",
            "        [0.7096, 0.2904],\n",
            "        [0.7221, 0.2779],\n",
            "        [0.7213, 0.2787],\n",
            "        [0.7046, 0.2954],\n",
            "        [0.6738, 0.3262],\n",
            "        [0.7069, 0.2931],\n",
            "        [0.7217, 0.2783],\n",
            "        [0.6837, 0.3163],\n",
            "        [0.7011, 0.2989],\n",
            "        [0.6944, 0.3056],\n",
            "        [0.6860, 0.3140],\n",
            "        [0.7183, 0.2817],\n",
            "        [0.7256, 0.2744],\n",
            "        [0.7288, 0.2712],\n",
            "        [0.6678, 0.3322],\n",
            "        [0.7253, 0.2747]], grad_fn=<SoftmaxBackward>)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MSSYMFxdJFRx",
        "colab_type": "text"
      },
      "source": [
        "ここまでの内容をフォルダ「utils」のtransformer.pyに別途保存しておき、次節からはこちらから読み込むようにします"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GLLg9VGIJFRx",
        "colab_type": "text"
      },
      "source": [
        "以上"
      ]
    }
  ]
}